{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 数据的合并与连接\n",
    "\n",
    "* 在面对多个表进行分析的时候，通常需要对数据进行类join操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据的合并\n",
    "\n",
    "* 基于行标或是列标对齐而进行的数据整合（注意要么基于行标要么基于列标，不是同时基于两者）\n",
    "* 合并基本不改变原始数据的结构"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "a=np.array([[2,4],[7,1]])\n",
    "b=np.array([[3,5],[2,7]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j\n",
       "1  2  4\n",
       "2  7  1"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a1=pd.DataFrame(a,index=[1,2],columns=[\"i\",\"j\"])\n",
    "a1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a1.i[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j\n",
       "1  3  5\n",
       "2  2  7"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b1=pd.DataFrame(b,index=[1,2],columns=[\"i\",\"j\"])\n",
    "b1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j\n",
       "1  2  4\n",
       "2  7  1\n",
       "1  3  5\n",
       "2  2  7"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a1,b1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j  i  j\n",
       "1  2  4  3  5\n",
       "2  7  1  2  7"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a1,b1],axis=1)\n",
    "#一般来看，pandas与numpy在合并方面遵循一样的逻辑，但是注意数据类型的区别，\n",
    "#pandas是以自定义索引为优先，而numpy则只有顺序索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j\n",
       "0  2  3\n",
       "1  1  4\n",
       "2  5  7"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a2=pd.DataFrame([[2,3],[1,4],[5,7]],columns=[\"i\",\"j\"])\n",
    "a2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j\n",
       "0  2  3\n",
       "1  1  4\n",
       "2  5  7\n",
       "1  3  5\n",
       "2  2  7"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a2,b1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j    i    j\n",
       "0  2  3  NaN  NaN\n",
       "1  1  4  3.0  5.0\n",
       "2  5  7  2.0  7.0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a2,b1],axis=1)\n",
    "#本质上pandas的合并，是以合并方向上的键相同为基础的，当找不到匹配的键的时候会置空\n",
    "#并且concat并不排斥另外一个方向的索引重复"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j  i  j\n",
       "1  1  4  3  5\n",
       "2  5  7  2  7"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a2,b1],axis=1,join=\"inner\")\n",
    "#如果希望他直接抛弃置空的所在行，则需要使用join这个参数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0  1    2    3\n",
       "0  2  3  NaN  NaN\n",
       "1  1  4  3.0  5.0\n",
       "2  5  7  2.0  7.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a2,b1],axis=1,ignore_index=True)\n",
    "#如果希望抛弃原来的索引，重建新的顺序索引的话可以用ignore_index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "p1=pd.concat([a2,b1],axis=1,keys=[\"a2\",\"b1\"])\n",
    "#如果希望仍然保留原来两个部分的结构的话，也可以利用如下的二层索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead tr th {\n",
       "        text-align: left;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th colspan=\"2\" halign=\"left\">a2</th>\n",
       "      <th colspan=\"2\" halign=\"left\">b1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>2.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  a2      b1     \n",
       "   i  j    i    j\n",
       "0  2  3  NaN  NaN\n",
       "1  1  4  3.0  5.0\n",
       "2  5  7  2.0  7.0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "p1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "#help(pd.concat)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 课间题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#已知\n",
    "a=np.array([[2,4],[7,1]])\n",
    "b=np.array([[3,5],[2,7]])\n",
    "#请自行将其转化为数据框，并进行横向和纵向的合并"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "a1=pd.DataFrame(a,index=[1,2],columns=[\"i\",\"j\"])\n",
    "b1=pd.DataFrame(b,index=[1,2],columns=[\"i\",\"j\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "      <th>i</th>\n",
       "      <th>j</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>7</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   i  j  i  j\n",
       "1  2  4  3  5\n",
       "2  7  1  2  7"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([a1,b1],axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 导入t_alibaba_data3数据，把\"06/04\"与\"07/04\"两天的数据提取出来纵向合并成一个数据框"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user  brand  behavr   date\n",
       "0  10944750  13451       0  06/04\n",
       "1  10944750  13451       2  06/04\n",
       "2  10944750  13451       2  06/04\n",
       "3  10944750  13451       0  06/04\n",
       "4  10944750  13451       0  06/04"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r1=pd.read_csv(r\"D:\\t_alibaba_data3.txt\",names=[\"user\",\"brand\",\"behavr\",\"date\"],sep=\"\\t\",dtype={\"behavr\":int})\n",
    "r1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#pd.concat([r1[r1.date==\"06/04\"],r1[r1.date==\"07/04\"]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 通过字符串的处理分离出月份序列，并把这个序列通过合并的方式作为一列并入r1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>10944750</td>\n",
       "      <td>21110</td>\n",
       "      <td>0</td>\n",
       "      <td>06/07</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>10944750</td>\n",
       "      <td>8689</td>\n",
       "      <td>0</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>10944750</td>\n",
       "      <td>8689</td>\n",
       "      <td>2</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>10944750</td>\n",
       "      <td>8689</td>\n",
       "      <td>2</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>10944750</td>\n",
       "      <td>8689</td>\n",
       "      <td>0</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>10944750</td>\n",
       "      <td>8689</td>\n",
       "      <td>0</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>10944750</td>\n",
       "      <td>26619</td>\n",
       "      <td>0</td>\n",
       "      <td>06/28</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>10944750</td>\n",
       "      <td>5185</td>\n",
       "      <td>0</td>\n",
       "      <td>07/10</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>10944750</td>\n",
       "      <td>18575</td>\n",
       "      <td>0</td>\n",
       "      <td>05/02</td>\n",
       "      <td>05</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>10944750</td>\n",
       "      <td>23662</td>\n",
       "      <td>0</td>\n",
       "      <td>06/19</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>10944750</td>\n",
       "      <td>23662</td>\n",
       "      <td>0</td>\n",
       "      <td>06/19</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>10944750</td>\n",
       "      <td>15761</td>\n",
       "      <td>0</td>\n",
       "      <td>04/24</td>\n",
       "      <td>04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>10944750</td>\n",
       "      <td>15761</td>\n",
       "      <td>0</td>\n",
       "      <td>04/24</td>\n",
       "      <td>04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>10944750</td>\n",
       "      <td>15761</td>\n",
       "      <td>0</td>\n",
       "      <td>04/24</td>\n",
       "      <td>04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>10944750</td>\n",
       "      <td>15761</td>\n",
       "      <td>0</td>\n",
       "      <td>04/24</td>\n",
       "      <td>04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/05</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/05</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/05</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/05</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/04</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>0</td>\n",
       "      <td>07/04</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>10944750</td>\n",
       "      <td>19673</td>\n",
       "      <td>1</td>\n",
       "      <td>07/05</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182850</th>\n",
       "      <td>847750</td>\n",
       "      <td>28169</td>\n",
       "      <td>0</td>\n",
       "      <td>08/09</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182851</th>\n",
       "      <td>847750</td>\n",
       "      <td>28169</td>\n",
       "      <td>1</td>\n",
       "      <td>08/09</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182852</th>\n",
       "      <td>847750</td>\n",
       "      <td>28169</td>\n",
       "      <td>1</td>\n",
       "      <td>08/14</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182853</th>\n",
       "      <td>847750</td>\n",
       "      <td>28169</td>\n",
       "      <td>0</td>\n",
       "      <td>08/14</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182854</th>\n",
       "      <td>847750</td>\n",
       "      <td>6193</td>\n",
       "      <td>0</td>\n",
       "      <td>08/10</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182855</th>\n",
       "      <td>847750</td>\n",
       "      <td>8327</td>\n",
       "      <td>0</td>\n",
       "      <td>08/04</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182856</th>\n",
       "      <td>847750</td>\n",
       "      <td>13682</td>\n",
       "      <td>0</td>\n",
       "      <td>07/20</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182857</th>\n",
       "      <td>847750</td>\n",
       "      <td>5640</td>\n",
       "      <td>0</td>\n",
       "      <td>08/08</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182858</th>\n",
       "      <td>847750</td>\n",
       "      <td>21838</td>\n",
       "      <td>0</td>\n",
       "      <td>07/23</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182859</th>\n",
       "      <td>847750</td>\n",
       "      <td>21838</td>\n",
       "      <td>0</td>\n",
       "      <td>07/23</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182860</th>\n",
       "      <td>847750</td>\n",
       "      <td>21838</td>\n",
       "      <td>0</td>\n",
       "      <td>07/23</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182861</th>\n",
       "      <td>847750</td>\n",
       "      <td>18990</td>\n",
       "      <td>0</td>\n",
       "      <td>08/04</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182862</th>\n",
       "      <td>847750</td>\n",
       "      <td>16607</td>\n",
       "      <td>0</td>\n",
       "      <td>08/14</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182863</th>\n",
       "      <td>847750</td>\n",
       "      <td>21659</td>\n",
       "      <td>0</td>\n",
       "      <td>07/16</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182864</th>\n",
       "      <td>847750</td>\n",
       "      <td>21659</td>\n",
       "      <td>0</td>\n",
       "      <td>07/16</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182865</th>\n",
       "      <td>847750</td>\n",
       "      <td>8662</td>\n",
       "      <td>0</td>\n",
       "      <td>08/15</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182866</th>\n",
       "      <td>847750</td>\n",
       "      <td>2655</td>\n",
       "      <td>0</td>\n",
       "      <td>08/15</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182867</th>\n",
       "      <td>847750</td>\n",
       "      <td>27245</td>\n",
       "      <td>0</td>\n",
       "      <td>08/07</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182868</th>\n",
       "      <td>847750</td>\n",
       "      <td>10232</td>\n",
       "      <td>0</td>\n",
       "      <td>08/13</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182869</th>\n",
       "      <td>847750</td>\n",
       "      <td>27406</td>\n",
       "      <td>0</td>\n",
       "      <td>07/19</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182870</th>\n",
       "      <td>847750</td>\n",
       "      <td>27406</td>\n",
       "      <td>0</td>\n",
       "      <td>07/19</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182871</th>\n",
       "      <td>847750</td>\n",
       "      <td>27406</td>\n",
       "      <td>0</td>\n",
       "      <td>07/19</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182872</th>\n",
       "      <td>847750</td>\n",
       "      <td>11080</td>\n",
       "      <td>0</td>\n",
       "      <td>08/13</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182873</th>\n",
       "      <td>847750</td>\n",
       "      <td>11080</td>\n",
       "      <td>0</td>\n",
       "      <td>08/13</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182874</th>\n",
       "      <td>847750</td>\n",
       "      <td>5742</td>\n",
       "      <td>0</td>\n",
       "      <td>08/04</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182875</th>\n",
       "      <td>847750</td>\n",
       "      <td>26631</td>\n",
       "      <td>0</td>\n",
       "      <td>07/30</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182876</th>\n",
       "      <td>847750</td>\n",
       "      <td>26631</td>\n",
       "      <td>0</td>\n",
       "      <td>07/30</td>\n",
       "      <td>07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182877</th>\n",
       "      <td>847750</td>\n",
       "      <td>2845</td>\n",
       "      <td>0</td>\n",
       "      <td>08/12</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182878</th>\n",
       "      <td>847750</td>\n",
       "      <td>5317</td>\n",
       "      <td>0</td>\n",
       "      <td>08/08</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>182879</th>\n",
       "      <td>847750</td>\n",
       "      <td>22353</td>\n",
       "      <td>0</td>\n",
       "      <td>08/08</td>\n",
       "      <td>08</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>182880 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            user  brand  behavr   date   0\n",
       "0       10944750  13451       0  06/04  06\n",
       "1       10944750  13451       2  06/04  06\n",
       "2       10944750  13451       2  06/04  06\n",
       "3       10944750  13451       0  06/04  06\n",
       "4       10944750  13451       0  06/04  06\n",
       "5       10944750  13451       0  06/04  06\n",
       "6       10944750  13451       0  06/04  06\n",
       "7       10944750  13451       0  06/04  06\n",
       "8       10944750  21110       0  06/07  06\n",
       "9       10944750   8689       0  05/02  05\n",
       "10      10944750   8689       2  05/02  05\n",
       "11      10944750   8689       2  05/02  05\n",
       "12      10944750   8689       0  05/02  05\n",
       "13      10944750   8689       0  05/02  05\n",
       "14      10944750  26619       0  06/28  06\n",
       "15      10944750   5185       0  07/10  07\n",
       "16      10944750  18575       0  05/02  05\n",
       "17      10944750  23662       0  06/19  06\n",
       "18      10944750  23662       0  06/19  06\n",
       "19      10944750  15761       0  04/24  04\n",
       "20      10944750  15761       0  04/24  04\n",
       "21      10944750  15761       0  04/24  04\n",
       "22      10944750  15761       0  04/24  04\n",
       "23      10944750  19673       0  07/05  07\n",
       "24      10944750  19673       0  07/05  07\n",
       "25      10944750  19673       0  07/05  07\n",
       "26      10944750  19673       0  07/05  07\n",
       "27      10944750  19673       0  07/04  07\n",
       "28      10944750  19673       0  07/04  07\n",
       "29      10944750  19673       1  07/05  07\n",
       "...          ...    ...     ...    ...  ..\n",
       "182850    847750  28169       0  08/09  08\n",
       "182851    847750  28169       1  08/09  08\n",
       "182852    847750  28169       1  08/14  08\n",
       "182853    847750  28169       0  08/14  08\n",
       "182854    847750   6193       0  08/10  08\n",
       "182855    847750   8327       0  08/04  08\n",
       "182856    847750  13682       0  07/20  07\n",
       "182857    847750   5640       0  08/08  08\n",
       "182858    847750  21838       0  07/23  07\n",
       "182859    847750  21838       0  07/23  07\n",
       "182860    847750  21838       0  07/23  07\n",
       "182861    847750  18990       0  08/04  08\n",
       "182862    847750  16607       0  08/14  08\n",
       "182863    847750  21659       0  07/16  07\n",
       "182864    847750  21659       0  07/16  07\n",
       "182865    847750   8662       0  08/15  08\n",
       "182866    847750   2655       0  08/15  08\n",
       "182867    847750  27245       0  08/07  08\n",
       "182868    847750  10232       0  08/13  08\n",
       "182869    847750  27406       0  07/19  07\n",
       "182870    847750  27406       0  07/19  07\n",
       "182871    847750  27406       0  07/19  07\n",
       "182872    847750  11080       0  08/13  08\n",
       "182873    847750  11080       0  08/13  08\n",
       "182874    847750   5742       0  08/04  08\n",
       "182875    847750  26631       0  07/30  07\n",
       "182876    847750  26631       0  07/30  07\n",
       "182877    847750   2845       0  08/12  08\n",
       "182878    847750   5317       0  08/08  08\n",
       "182879    847750  22353       0  08/08  08\n",
       "\n",
       "[182880 rows x 5 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.concat([r1,r1.date.str.split(\"/\",expand=True)[0]],axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>06</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user  brand  behavr   date   0\n",
       "0  10944750  13451       0  06/04  06\n",
       "1  10944750  13451       2  06/04  06\n",
       "2  10944750  13451       2  06/04  06\n",
       "3  10944750  13451       0  06/04  06\n",
       "4  10944750  13451       0  06/04  06"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 特征矩阵的构建方法\n",
    "\n",
    "* 通常会分别统计各个维度的特征，然后合并成一个数据框"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "brand\n",
       "19        1\n",
       "22        1\n",
       "29        3\n",
       "43        1\n",
       "57        1\n",
       "62        1\n",
       "83       10\n",
       "141       1\n",
       "143       8\n",
       "155      16\n",
       "178       3\n",
       "194       1\n",
       "201       1\n",
       "202       1\n",
       "207       1\n",
       "215       1\n",
       "217       6\n",
       "239      12\n",
       "241       7\n",
       "251      10\n",
       "255       4\n",
       "259       2\n",
       "264       1\n",
       "360       3\n",
       "377       1\n",
       "398       4\n",
       "414       2\n",
       "422       1\n",
       "437       1\n",
       "454       1\n",
       "         ..\n",
       "29095     1\n",
       "29099    32\n",
       "29146     2\n",
       "29197     3\n",
       "29202     6\n",
       "29203    14\n",
       "29213     2\n",
       "29224     3\n",
       "29225     2\n",
       "29231     3\n",
       "29265     2\n",
       "29330     2\n",
       "29331     1\n",
       "29334     1\n",
       "29363     2\n",
       "29364     2\n",
       "29375     1\n",
       "29385     1\n",
       "29389     1\n",
       "29442     1\n",
       "29445     3\n",
       "29446     3\n",
       "29465     4\n",
       "29472     2\n",
       "29473     2\n",
       "29490     2\n",
       "29505     1\n",
       "29539     1\n",
       "29547     7\n",
       "29551     1\n",
       "Length: 2149, dtype: int64"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r1[r1.behavr==1].groupby(\"brand\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>浏览</th>\n",
       "      <th>购买</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>brand</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>33.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>21.0</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>8.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>57</th>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>58</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>62</th>\n",
       "      <td>12.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>72</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>83</th>\n",
       "      <td>8.0</td>\n",
       "      <td>10.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>90</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>104</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>4.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>116</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>119</th>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>120</th>\n",
       "      <td>9.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>121</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29463</th>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29464</th>\n",
       "      <td>29.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29465</th>\n",
       "      <td>63.0</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29472</th>\n",
       "      <td>19.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29473</th>\n",
       "      <td>98.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29474</th>\n",
       "      <td>10.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29475</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29477</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29486</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29487</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29488</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29490</th>\n",
       "      <td>71.0</td>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29491</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29493</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29497</th>\n",
       "      <td>84.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29501</th>\n",
       "      <td>8.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29502</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29504</th>\n",
       "      <td>5.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29505</th>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29509</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29524</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29528</th>\n",
       "      <td>10.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29529</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29531</th>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29538</th>\n",
       "      <td>3.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29539</th>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29541</th>\n",
       "      <td>16.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29547</th>\n",
       "      <td>357.0</td>\n",
       "      <td>7.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29551</th>\n",
       "      <td>9.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29552</th>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>9526 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          浏览    购买\n",
       "brand             \n",
       "11       4.0   NaN\n",
       "15       6.0   NaN\n",
       "18       1.0   NaN\n",
       "19       2.0   1.0\n",
       "20       5.0   NaN\n",
       "22      33.0   1.0\n",
       "23       1.0   NaN\n",
       "26       9.0   NaN\n",
       "27       4.0   NaN\n",
       "29      21.0   3.0\n",
       "33       1.0   NaN\n",
       "38       1.0   NaN\n",
       "39       3.0   NaN\n",
       "43       8.0   1.0\n",
       "45       1.0   NaN\n",
       "53       2.0   NaN\n",
       "57       9.0   1.0\n",
       "58       1.0   NaN\n",
       "62      12.0   1.0\n",
       "72       4.0   NaN\n",
       "83       8.0  10.0\n",
       "90       3.0   NaN\n",
       "91       3.0   NaN\n",
       "103      4.0   NaN\n",
       "104      1.0   NaN\n",
       "111      4.0   NaN\n",
       "116      1.0   NaN\n",
       "119      2.0   NaN\n",
       "120      9.0   NaN\n",
       "121      1.0   NaN\n",
       "...      ...   ...\n",
       "29463    2.0   NaN\n",
       "29464   29.0   NaN\n",
       "29465   63.0   4.0\n",
       "29472   19.0   2.0\n",
       "29473   98.0   2.0\n",
       "29474   10.0   NaN\n",
       "29475    1.0   NaN\n",
       "29477    1.0   NaN\n",
       "29486    1.0   NaN\n",
       "29487    1.0   NaN\n",
       "29488    1.0   NaN\n",
       "29490   71.0   2.0\n",
       "29491    1.0   NaN\n",
       "29493    3.0   NaN\n",
       "29497   84.0   NaN\n",
       "29501    8.0   NaN\n",
       "29502    1.0   NaN\n",
       "29504    5.0   NaN\n",
       "29505    7.0   1.0\n",
       "29509    1.0   NaN\n",
       "29524    3.0   NaN\n",
       "29528   10.0   NaN\n",
       "29529    3.0   NaN\n",
       "29531    1.0   NaN\n",
       "29538    3.0   NaN\n",
       "29539    9.0   1.0\n",
       "29541   16.0   NaN\n",
       "29547  357.0   7.0\n",
       "29551    9.0   1.0\n",
       "29552    2.0   NaN\n",
       "\n",
       "[9526 rows x 2 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.DataFrame({\"浏览\":r1[r1.behavr==0].groupby(\"brand\").size(),\"购买\":r1[r1.behavr==1].groupby(\"brand\").size()})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据的连接\n",
    "\n",
    "* 基于值的数据合并\n",
    "* merge也可以基于键但是不推荐"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 一对一"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items\n",
       "0  001   104\n",
       "1  002   101\n",
       "2  003   103\n",
       "3  004   102"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a2=pd.DataFrame([[\"001\",\"104\"],[\"002\",\"101\"],[\"003\",\"103\"],[\"004\",\"102\"]],columns=[\"user\",\"items\"])\n",
    "a2\n",
    "#用户购买了商品"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seller</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>s01</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>s02</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>s03</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>s04</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  seller items\n",
       "0    s01   101\n",
       "1    s02   103\n",
       "2    s03   102\n",
       "3    s04   104"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b2=pd.DataFrame([[\"s01\",\"101\"],[\"s02\",\"103\"],[\"s03\",\"102\"],[\"s04\",\"104\"]],columns=[\"seller\",\"items\"])\n",
    "b2\n",
    "#商品属于商家"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    s03"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b2)\n",
    "#一对一的合并就是按照对应列的值相同的原则（不再是找相同索引），把对应的数据拼接起来"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 一对多\n",
    "\n",
    "* 多对多类似，只不过两边都复制"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>003</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items\n",
       "0  001   104\n",
       "1  002   101\n",
       "2  003   103\n",
       "3  003   102\n",
       "4  004   102"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a3=pd.DataFrame([[\"001\",\"104\"],[\"002\",\"101\"],[\"003\",\"103\"],[\"003\",\"102\"],[\"004\",\"102\"]],columns=[\"user\",\"items\"])\n",
    "a3\n",
    "#假设某个用户购买了多个商品"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>003</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  003   102    s03\n",
       "4  004   102    s03"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a3,b2)\n",
    "#有重复的一方会指导缺记录的一方进行复制"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 当两列数据并不完全对应时\n",
    "\n",
    "* 默认抛弃不一致的\n",
    "* 也可以通过how=\"outer\"或者\"left\"等方法设置保留方\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items\n",
       "0  001   104\n",
       "1  002   101\n",
       "2  003   103\n",
       "3  004   102"
      ]
     },
     "execution_count": 74,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seller</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>s01</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>s02</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>s05</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>s04</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  seller items\n",
       "0    s01   101\n",
       "1    s02   103\n",
       "2    s05   105\n",
       "3    s04   104"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b4=pd.DataFrame([[\"s01\",\"101\"],[\"s02\",\"103\"],[\"s05\",\"105\"],[\"s04\",\"104\"]],columns=[\"seller\",\"items\"])\n",
    "b4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b4)\n",
    "#默认情况下匹配不上的行会丢掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>105</td>\n",
       "      <td>s05</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    NaN\n",
       "4  NaN   105    s05"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b4,how=\"outer\")\n",
    "#可以设置不丢掉"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    NaN"
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b4,how=\"left\")\n",
    "#也可以设置保留哪一边"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 当列标不一致时，指定进行连接的基列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    s03"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    s03"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b2,on=\"items\")\n",
    "#merge默认会按照名称一样的列进行连接，当有多个一样的列表，则需要指定"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items\n",
       "0  001   104\n",
       "1  002   101\n",
       "2  003   103\n",
       "3  004   102"
      ]
     },
     "execution_count": 64,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seller</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>s01</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>s02</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>s03</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>s04</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  seller   id\n",
       "0    s01  101\n",
       "1    s02  103\n",
       "2    s03  102\n",
       "3    s04  104"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b21=pd.DataFrame([[\"s01\",\"101\"],[\"s02\",\"103\"],[\"s03\",\"102\"],[\"s04\",\"104\"]],columns=[\"seller\",\"id\"])\n",
    "b21\n",
    "#商品的列标不一致"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "      <th>id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller   id\n",
       "0  001   104    s04  104\n",
       "1  002   101    s01  101\n",
       "2  003   103    s02  103\n",
       "3  004   102    s03  102"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1=pd.merge(a2,b21,left_on=\"items\",right_on=\"id\")\n",
    "f1\n",
    "#这种方法会出现重复列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "      <td>s03</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  004   102    s03"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "f1.drop(\"id\",axis=1)\n",
    "#可以通过drop去掉"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 课间题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#已知a2,b4如下\n",
    "a2=pd.DataFrame([[\"001\",\"104\"],[\"002\",\"101\"],[\"003\",\"103\"],[\"004\",\"102\"]],columns=[\"user\",\"items\"])\n",
    "b4=pd.DataFrame([[\"s01\",\"101\"],[\"s02\",\"103\"],[\"s05\",\"105\"],[\"s04\",\"104\"]],columns=[\"seller\",\"items\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>004</td>\n",
       "      <td>102</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items\n",
       "0  001   104\n",
       "1  002   101\n",
       "2  003   103\n",
       "3  004   102"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>seller</th>\n",
       "      <th>items</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>s01</td>\n",
       "      <td>101</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>s02</td>\n",
       "      <td>103</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>s05</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>s04</td>\n",
       "      <td>104</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  seller items\n",
       "0    s01   101\n",
       "1    s02   103\n",
       "2    s05   105\n",
       "3    s04   104"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1,基于items连接a2 b4 不用显示匹配不上的行，2 连接两者，不丢掉置空行，3 连接两者，保留b4的置空行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>items</th>\n",
       "      <th>seller</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>001</td>\n",
       "      <td>104</td>\n",
       "      <td>s04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>002</td>\n",
       "      <td>101</td>\n",
       "      <td>s01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>003</td>\n",
       "      <td>103</td>\n",
       "      <td>s02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>105</td>\n",
       "      <td>s05</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  user items seller\n",
       "0  001   104    s04\n",
       "1  002   101    s01\n",
       "2  003   103    s02\n",
       "3  NaN   105    s05"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(a2,b4,how=\"right\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 求出各个用户的购买总量与浏览总量，并把两者合并为一个特征矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "g1=r1[r1.behavr==1].groupby(\"user\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "l1=r1[r1.behavr==0].groupby(\"user\").size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "tz=pd.DataFrame({\"g1\":g1,\"l1\":l1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19500</th>\n",
       "      <td>10.0</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38250</th>\n",
       "      <td>10.0</td>\n",
       "      <td>393.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42000</th>\n",
       "      <td>3.0</td>\n",
       "      <td>255.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         g1     l1\n",
       "user              \n",
       "19500  10.0  305.0\n",
       "29750   NaN   14.0\n",
       "38250  10.0  393.0\n",
       "39750   NaN   14.0\n",
       "42000   3.0  255.0"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user  brand  behavr   date\n",
       "0  10944750  13451       0  06/04\n",
       "1  10944750  13451       2  06/04\n",
       "2  10944750  13451       2  06/04\n",
       "3  10944750  13451       0  06/04\n",
       "4  10944750  13451       0  06/04"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "r1.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19500</th>\n",
       "      <td>10.0</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38250</th>\n",
       "      <td>10.0</td>\n",
       "      <td>393.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42000</th>\n",
       "      <td>3.0</td>\n",
       "      <td>255.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          a      b\n",
       "user              \n",
       "19500  10.0  305.0\n",
       "29750   NaN   14.0\n",
       "38250  10.0  393.0\n",
       "39750   NaN   14.0\n",
       "42000   3.0  255.0"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 请自行百度，把上面特征矩阵的索引变为列"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>user</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19500</th>\n",
       "      <td>10.0</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38250</th>\n",
       "      <td>10.0</td>\n",
       "      <td>393.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39750</th>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42000</th>\n",
       "      <td>3.0</td>\n",
       "      <td>255.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         g1     l1\n",
       "user              \n",
       "19500  10.0  305.0\n",
       "29750   NaN   14.0\n",
       "38250  10.0  393.0\n",
       "39750   NaN   14.0\n",
       "42000   3.0  255.0"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "tz.reset_index(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19500</td>\n",
       "      <td>10.0</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29750</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38250</td>\n",
       "      <td>10.0</td>\n",
       "      <td>393.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>39750</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42000</td>\n",
       "      <td>3.0</td>\n",
       "      <td>255.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    user    g1     l1\n",
       "0  19500  10.0  305.0\n",
       "1  29750   NaN   14.0\n",
       "2  38250  10.0  393.0\n",
       "3  39750   NaN   14.0\n",
       "4  42000   3.0  255.0"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>19500</td>\n",
       "      <td>10.0</td>\n",
       "      <td>305.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>29750</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>38250</td>\n",
       "      <td>10.0</td>\n",
       "      <td>393.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>39750</td>\n",
       "      <td>NaN</td>\n",
       "      <td>14.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>42000</td>\n",
       "      <td>3.0</td>\n",
       "      <td>255.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    user    g1     l1\n",
       "0  19500  10.0  305.0\n",
       "1  29750   NaN   14.0\n",
       "2  38250  10.0  393.0\n",
       "3  39750   NaN   14.0\n",
       "4  42000   3.0  255.0"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tz.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* 请基于user连接特征矩阵与原始记录数据r1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user  brand  behavr   date   g1     l1\n",
       "0  10944750  13451       0  06/04  6.0  212.0\n",
       "1  10944750  13451       2  06/04  6.0  212.0\n",
       "2  10944750  13451       2  06/04  6.0  212.0\n",
       "3  10944750  13451       0  06/04  6.0  212.0\n",
       "4  10944750  13451       0  06/04  6.0  212.0"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(r1,tz).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>user</th>\n",
       "      <th>brand</th>\n",
       "      <th>behavr</th>\n",
       "      <th>date</th>\n",
       "      <th>g1</th>\n",
       "      <th>l1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>2</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>10944750</td>\n",
       "      <td>13451</td>\n",
       "      <td>0</td>\n",
       "      <td>06/04</td>\n",
       "      <td>6.0</td>\n",
       "      <td>212.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       user  brand  behavr   date   g1     l1\n",
       "0  10944750  13451       0  06/04  6.0  212.0\n",
       "1  10944750  13451       2  06/04  6.0  212.0\n",
       "2  10944750  13451       2  06/04  6.0  212.0\n",
       "3  10944750  13451       0  06/04  6.0  212.0\n",
       "4  10944750  13451       0  06/04  6.0  212.0"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.merge(r1,tz).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
